{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sequence Generation and Alignment Analysis with Evo2\n",
    "This notebook demonstrates how to generate biological sequences using the Evo2 model and analyze them using Biopython alignments.\n",
    "\n",
    "## Setup and Dependencies\n",
    "\n",
    "First, let's import our required libraries and set up our environment. Note you need Jupyter to run notebooks.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import argparse\n",
    "import csv\n",
    "from pathlib import Path\n",
    "from typing import List, Optional, Tuple\n",
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from Bio import pairwise2\n",
    "from Bio.pairwise2 import format_alignment\n",
    "from Bio.Seq import Seq\n",
    "\n",
    "from evo2 import Evo2\n",
    "\n",
    "# Set random seeds for reproducibility\n",
    "torch.manual_seed(42)\n",
    "torch.cuda.manual_seed(42)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Initialization\n",
    "Let's initialize our Evo2 model. We'll use the 7B parameter version as a default."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 32263.88it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found complete file in repo: evo2_7b.pt\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 32/32 [00:00<00:00, 180.96it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extra keys in state_dict: {'blocks.2.mixer.mixer.filter.t', 'blocks.16.mixer.mixer.filter.t', 'blocks.20.mixer.mixer.filter.t', 'blocks.9.mixer.mixer.filter.t', 'blocks.27.mixer.mixer.filter.t', 'blocks.17.mixer.dense._extra_state', 'blocks.31.mixer.attn._extra_state', 'blocks.24.mixer.dense._extra_state', 'blocks.17.mixer.attn._extra_state', 'blocks.13.mixer.mixer.filter.t', 'blocks.10.mixer.attn._extra_state', 'blocks.10.mixer.dense._extra_state', 'blocks.30.mixer.mixer.filter.t', 'blocks.31.mixer.dense._extra_state', 'blocks.24.mixer.attn._extra_state', 'blocks.3.mixer.dense._extra_state', 'blocks.3.mixer.attn._extra_state', 'unembed.weight', 'blocks.6.mixer.mixer.filter.t', 'blocks.23.mixer.mixer.filter.t'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/gbrixi/miniconda/envs/hf_tracking_test/lib/python3.11/site-packages/transformer_engine/pytorch/module/base.py:630: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
      "  state = torch.load(state, map_location=\"cuda\")\n",
      "/home/gbrixi/evo2/vortex/vortex/model/utils.py:153: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
      "  return torch_load(state, map_location=device)\n"
     ]
    }
   ],
   "source": [
    "model_name = 'evo2_7b'\n",
    "\n",
    "model = Evo2(model_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Loading\n",
    "Next we'll create functions to load our example sequences\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 4 sequence pairs\n"
     ]
    }
   ],
   "source": [
    "def read_sequences(input_file: Path) -> Tuple[List[str], List[str]]:\n",
    "    \"\"\"\n",
    "    Read input and target sequences from CSV file.\n",
    "    \n",
    "    Expected CSV format:\n",
    "    input_sequence,target_sequence\n",
    "    ACGTACGT,ACGTACGTAA\n",
    "    ...\n",
    "    \"\"\"\n",
    "    input_seqs: List[str] = []\n",
    "    names: List[str] = []\n",
    "    \n",
    "    with open(input_file, encoding='utf-8-sig', newline='') as csvfile:\n",
    "        reader = csv.reader(csvfile)\n",
    "        next(reader)  # Skip header\n",
    "        for row in reader:\n",
    "            input_seqs.append(row[0])\n",
    "            if len(row) > 1:\n",
    "                names.append(row[1])\n",
    "    \n",
    "    return input_seqs, names\n",
    "\n",
    "# Load example data\n",
    "\n",
    "sequences, names = read_sequences('../evo2/test/data/prompts.csv')\n",
    "\n",
    "# For 'autocomplete', we split the data into input and target sequences\n",
    "\n",
    "input_seqs = [seq[:500] for seq in sequences]\n",
    "target_seqs = [seq[500:1000] for seq in sequences]\n",
    "\n",
    "print(f\"Loaded {len(sequences)} sequence pairs\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now it's time to generate!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing inference params with max_seqlen=1000\n",
      "Prompt: \"GAATAGGAACAGCTCCGGTCTACAGCTCCCAGCGTGAGCGACGCAGAAGACGGTGATTTCTGCATTTCCATCTGAGGTACCGGGTTCATCTCACTAGGGAGTGCCAGACAGTGGGCGCAGGCCAGTGTGTGTGCGCACCGTGCGCGAGCCGAAGCAGGGCGAGGCATTGCCTCACCTGGGAAGCGCAAGGGGTCAGGGAGTTCCCTTTCCGAGTCAAAGAAAGGGGTGATGGACGCACCTGGAAAATCGGGTCACTCCCACCCGAATATTGCGCTTTTCAGACCGGCTTAAGAAACGGCGCACCACGAGACTATATCCCACACCTGGCTCAGAGGGTCCTACGCCCACGGAATCTCGCTGATTGCTAGCACAGCAGTCTGAGATCAAACTGCAAGGCGGCAACGAGGCTGGGGGAGGGGCGCCCGCCATTGCCCAGGCTTGCTTAGGTAAACAAAGCAGCCGGGAAGCTCGAACTGGGTGGAGCCCACCACAGCTCAAGG\",\tOutput: \"AGGCCTGCCTGCCTCTGTAGGCTCCACCTCCGGGGGAAGGGCACAGCCCAACAAAAGGCGGCAGACACCTCTGCAGACTTAAATGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCTAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGATCCTGTCTGCAAGACAGCTTAGGCCCTACAACAGTCTTGCAGCCACCTCTACTGATGTAGGAAAGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGAGCAGGGCATAGACAAACAAAAAGAGGCAGCAGCAGCCTCAGCAGACAGAAACCATACCGCCTGGCAGCTTTGAAGAGAGCAGTGGATCTCCCAACACGGAGGTTGAGATCTGAGAACGGACAGAC\",\tScore: -0.26270025968551636\n",
      "Prompt: \"GACACCATCGAATGGCGCAAAACCTTTCGCGGTATGGCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGTGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATCAGACCGTTTCCCGCGTGGTGAACCAGGCCAGCCACGTTTCTGCGAAAACGCGGGAAAAAGTGGAAGCGGCGATGGCGGAGCTGAATTACATTCCCAACCGCGTGGCACAACAACTGGCGGGCAAACAGTCGTTGCTGATTGGCGTTGCCACCTCCAGTCTGGCCCTGCACGCGCCGTCGCAAATTGTCGCGGCGATTAAATCTCGCGCCGATCAACTGGGTGCCAGCGTGGTGGTGTCGATGGTAGAACGAAGCGGCGTCGAAGCCTGTAAAGCGGCGGTGCACAATCTTCTCGCGCAACGCGTCAGTGGGCTGATCATTAACTATCCGCTGGATGACCAGGATGCCATTGCTGTGGAAGCTGCCTGCAC\",\tOutput: \"TAATGTTCCGGCGTTGTTTCTTGATGTCTCTGACCAGACTTCCGTTAACAGTATTATTTTCTCCCATGAAGACGGTACGCGACTGGGCGTGGAACATCTGATCGCATTAGGTCACCAGCAAATCGCGCTGTTAGCGGGGCCATTAAGTTCTGTCTCGGCGCGTCTGAGGCTGGCGGGCTGGCATAAATATCTCACTCGCAACCATATCCAGCCGATAGCGGTACGGGAAGGCGACTGGAGTGCCATGTCCGGTTATCAACAAACGATGGAAATGCTGAATAACGGCATCGTACCGTCGGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCACTGGAAGAACATAAACTTTCGGTACCGGAAGATATCTCGGTGATTGGTTATGACGATACCGAAGACAGCTCGTGTTTTATTCCGCCGTTGACCACTATCAAGCAGGATTTTCGTCTGCTGGGGCAGACAGCTGTGGACCGCCTGCTGCAACTGA\",\tScore: -0.15721526741981506\n",
      "Prompt: \"GTTAATGTAGCTTAAAACAAAAGCAAGGTACTGAAAATACCTAGACGAGTATATCCAACTCCATAAACAACAAAGGTTTGGTCCCGGCCTTCTTATTGGTTACTAGGAAACTTATACATGCAAGTATCCGCCCGCCAGTGAATACGCCTTCTAAATCATCACTGATCAAAGAGAGCTGGCATCAAGCACACACCCCAAGTGTAGCTCATGACGTCTCGCCTAGCCACACCCCCACGGGAAACAGCAGTAGTAAATATTTAGCAATTAACAAAAGTTAGACTAAGTTATCCTAATAAAGGACTGGTCAATTTCGTGCCAGCAACCGCGGCCATACGATTAGTCCAAATTAATAAGCATACGGCGTAAAGCGTATTAGAAGAATTAAAAAAATAAAGTTAAATCTTATACTAGCTGTTTAAAGCTCAAGATAAGACATAAATAGCCTACGAAAGTGACTTTAATAATCCTAAACATACGATAGCTAGGGTACAAACTGAGAT\",\tOutput: \"TAGATACCTCACTATGCTTAGCCATAAACCTAGGCAGAGTATAACCAATCTGCCAGCCAGAGTACTACTAGCAATAGCTTAAAACTCAAAGGACTTGGCGGTGCTTTATATCCACCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGATAAACCTTACCACTTTTTGCTAATACAGTCTATATACCGCCATCTTCAGCAAACCCTTAAAAGGAATCACAGTAAGCAAAAACTTAGCACATAGGAACGTTAGGTCAAGGTGTAACCTATAAAGTGGTAAGAAATGGGCTACATTTTTTTAATTAAAAACACATTCTATACTAAACCTATGAAAATATTAAGCCTAAGGTGGATTTAGTAGTAAATTAAGAATAGAGAGCTTAATTGAATGAGAAAATTGGGCGCACACAATGCCCGTCACCCTCCTCAAATAATTATTACACAGTATAAAATACCATTAAAACAAAATCAACCAAAAAGGAGAAAAGTCGTAACAAGGT\",\tScore: -0.48031729459762573\n",
      "Prompt: \"GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCCGC\",\tOutput: \"GTATCTGATATGGATACGGTGAATAGTGGCTTCTTCAGTGTGCATCGTATTCATAGAAGAAACGATTTTTTTCGATACTGATTTCATTAGTGACTACGCTATCGGGCCTGCACTCTTTAAGGGATACAGATTCGTCGACTGTATTGGCCATTTGGCCACAGTGTGACCACATGACCATAATCTACATAGGTCTACATCATCGATAGGATTGCATCGGGGAGACGATACGGGGACAGGTATGTATTCATGAGCGTTGATGTCGCCACGGATGCGTTAAGCATATCCGCTCCGGATCCTCGCTGGATGGGGTATTACTGTTTATGAATGTCTTGTCTGCTCAAAGGCCGCGCTGCAGCAAAATACCCAAATGTCCAAAATGTGGGACATTTGCACATTGGGTGACAGTTGTTCGCAAAAGGCGCGCCCGAAAGGCATAATTCGGCCAACCCCAAATTCACAACTGCAAAACAGTAGTATAGTGATCGACTATATTTGCATTA\",\tScore: -1.3735215663909912\n",
      "['AGGCCTGCCTGCCTCTGTAGGCTCCACCTCCGGGGGAAGGGCACAGCCCAACAAAAGGCGGCAGACACCTCTGCAGACTTAAATGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCTAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGATCCTGTCTGCAAGACAGCTTAGGCCCTACAACAGTCTTGCAGCCACCTCTACTGATGTAGGAAAGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGAGCAGGGCATAGACAAACAAAAAGAGGCAGCAGCAGCCTCAGCAGACAGAAACCATACCGCCTGGCAGCTTTGAAGAGAGCAGTGGATCTCCCAACACGGAGGTTGAGATCTGAGAACGGACAGAC', 'TAATGTTCCGGCGTTGTTTCTTGATGTCTCTGACCAGACTTCCGTTAACAGTATTATTTTCTCCCATGAAGACGGTACGCGACTGGGCGTGGAACATCTGATCGCATTAGGTCACCAGCAAATCGCGCTGTTAGCGGGGCCATTAAGTTCTGTCTCGGCGCGTCTGAGGCTGGCGGGCTGGCATAAATATCTCACTCGCAACCATATCCAGCCGATAGCGGTACGGGAAGGCGACTGGAGTGCCATGTCCGGTTATCAACAAACGATGGAAATGCTGAATAACGGCATCGTACCGTCGGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCACTGGAAGAACATAAACTTTCGGTACCGGAAGATATCTCGGTGATTGGTTATGACGATACCGAAGACAGCTCGTGTTTTATTCCGCCGTTGACCACTATCAAGCAGGATTTTCGTCTGCTGGGGCAGACAGCTGTGGACCGCCTGCTGCAACTGA', 'TAGATACCTCACTATGCTTAGCCATAAACCTAGGCAGAGTATAACCAATCTGCCAGCCAGAGTACTACTAGCAATAGCTTAAAACTCAAAGGACTTGGCGGTGCTTTATATCCACCTAGAGGAGCCTGTTCTATAATCGATAAACCCCGATAAACCTTACCACTTTTTGCTAATACAGTCTATATACCGCCATCTTCAGCAAACCCTTAAAAGGAATCACAGTAAGCAAAAACTTAGCACATAGGAACGTTAGGTCAAGGTGTAACCTATAAAGTGGTAAGAAATGGGCTACATTTTTTTAATTAAAAACACATTCTATACTAAACCTATGAAAATATTAAGCCTAAGGTGGATTTAGTAGTAAATTAAGAATAGAGAGCTTAATTGAATGAGAAAATTGGGCGCACACAATGCCCGTCACCCTCCTCAAATAATTATTACACAGTATAAAATACCATTAAAACAAAATCAACCAAAAAGGAGAAAAGTCGTAACAAGGT', 'GTATCTGATATGGATACGGTGAATAGTGGCTTCTTCAGTGTGCATCGTATTCATAGAAGAAACGATTTTTTTCGATACTGATTTCATTAGTGACTACGCTATCGGGCCTGCACTCTTTAAGGGATACAGATTCGTCGACTGTATTGGCCATTTGGCCACAGTGTGACCACATGACCATAATCTACATAGGTCTACATCATCGATAGGATTGCATCGGGGAGACGATACGGGGACAGGTATGTATTCATGAGCGTTGATGTCGCCACGGATGCGTTAAGCATATCCGCTCCGGATCCTCGCTGGATGGGGTATTACTGTTTATGAATGTCTTGTCTGCTCAAAGGCCGCGCTGCAGCAAAATACCCAAATGTCCAAAATGTGGGACATTTGCACATTGGGTGACAGTTGTTCGCAAAAGGCGCGCCCGAAAGGCATAATTCGGCCAACCCCAAATTCACAACTGCAAAACAGTAGTATAGTGATCGACTATATTTGCATTA']\n"
     ]
    }
   ],
   "source": [
    "generations = model.generate(\n",
    "    input_seqs,\n",
    "    n_tokens=500,\n",
    "    temperature=1.0,\n",
    ")\n",
    "\n",
    "generated_seqs = generations.sequences\n",
    "print(generated_seqs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Alignment Analysis\n",
    "### Let's analyze our generated sequences using Biopython's alignment tools."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Sequence Alignments:\n",
      "\n",
      "Alignment 1 (L1RE2):\n",
      "AGGCCTGCCTGCCTCTGTAGGCTCCACCTCC-GGGGGA-AGGGCACAGCC-CAA-CAAAA-GGCG-GCAG-ACACCTCTGCAGACTTAAA-TGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCT-AGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGA-TCCTGTCTGCA--AGACAG-----CTTAGG-C--CCTAC-AACAGTCTTGCAGCCACCTCTACTGAT--GTAGGAAAGCCTGCCTGCC-TCTGTAGGC-TCCACC-TC-TGGG---AG-C----AGGGCATAG--ACAAACA-A-AAAGA-GGCAG-------CAGCAGCCTCAGCAGACA------GAAAC-C---ATACCGCCT-G-GCAGC-T-T--TG------AAGAGA--GCAGTGGATC-TC-C---CAACACGG-----AGGT-TGAGATCTGAGAACGGACA-GAC---\n",
      "||||||||||||||||||||||||||||| | |||||  |||||||||   ||| ||||| |||  |||| | |||||||||||||| || ||||||||||||||||||||||||||||||||||||||||  |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||  |||||||||    ||| ||     | ||   |  || |  || ||    | |  ||  |||||  |   | |  ||| |       || ||||||  | | |||| || |      || |    ||    |||  | ||||  | ||||| ||  |       ||| |     | |||| |      ||||| |   | | |||   | || || | |  |       ||| ||  |||||   || || |   |||||  |     ||   || ||  || ||   || | ||    \n",
      "AGGCCTGCCTGCCTCTGTAGGCTCCACCT-CTGGGGG-CAGGGCACAG--ACAAACAAAAAGGC-AGCAGTA-ACCTCTGCAGACTT-AAGTGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCC-CAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGG-GTCCTGTCTG--TTAGA-AGGAAAAC-TA--ACAACC-A-GAA-AG----G-A--CA--TCTAC--A-CCG-A--AAA-C-------CCATCTGTA--CAT-CACCATCAT---CAAAGACCAAAAG----TAGATA-AAAC-CACAAAGATGG--GGAAAAAACAG-A-----A-CAGA-AAAACTGGAAACTCTAAA-A-CGC--AGAGC-GCCTCTCCT-CCTCCAAAG-GAACGCAGT---TCCTCACCAGCAACA--GAACAAAG--CTG-GA--TG-GA---GA-ATGA-TTT\n",
      "  Score=772.4\n",
      "\n",
      "Sequence similarity: 82.80%\n",
      "Alignment score: 772.40\n",
      "\n",
      "Alignment 2 (ECOLAC):\n",
      "TAATGTTCCGGCGTTG-TTTCTTGATGTCTCTGACCAG--ACTTCCGT-T-AACAGTATTATTTTCTCCCATGAAGACGGTACGCGACTGGGCGTGGAA-CATCTGA-TCGCATTAGG-TCACCAGCAAATCGCGCTGTTAGCGGGGCC-ATTAAGTTCTGTCTCGGCGCGTCTGAG-G-CTGGCG-GGCTGGCATAAATATCTCACTCGCAAC-C--ATATCCAGCCGATAGCGGTA-CGGGAAGGCGACTGGAGTGCCATGTCCGGTTAT-CAACAAACG-ATGG-AAATGCTGAAT-AACGG-CATCGT----ACCGTCGGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGC----AC----T--GGAAGAACATAAA---C-TTTCGGTAC-CGGAAGATATCTCGGT-GATTGGTT-AT--GACGATACCGAAGACAGCTCG-TGTTTTAT-TCC-GCCGTTG--ACCACT-ATCAAG-CAGGATTTTCGTC-TGCTGGGGCAGA--CAGCTGTGGACCGCCT-GCTGCAA--CTGA\n",
      "|||||||||||||||  |||||||||||||||||||||  ||  ||   | |||||||||||||||||||||||||||||||||||||||||||||| | ||||||  ||||||| || |||||||||||||||||||||||| ||||| ||||||||||||||||||||||||  | | |||||  ||||||||||||||||||||||||||  |  || | ||||||||||||| | ||||||||||||||||||||||||||||||| | ||||||||  || | ||||||||||| |  || ||||||    ||  |  |||||||||||||||||||||||||||||||||||||||||||||||    ||    |  ||  |  | |      | ||  |||   ||   ||||||||||| |  |||   ||  ||||||||||||||||||||  ||  |||| ||| |||||    |||||  |||||  ||||||||||| | |||||||||| |  |||| |||||||| || |||||||  ||  \n",
      "TAATGTTCCGGCGTT-ATTTCTTGATGTCTCTGACCAGACAC--CC--ATCAACAGTATTATTTTCTCCCATGAAGACGGTACGCGACTGGGCGTGG-AGCATCTG-GTCGCATT-GGGTCACCAGCAAATCGCGCTGTTAGC-GGGCCCATTAAGTTCTGTCTCGGCGCGTCT--GCGTCTGGC-TGGCTGGCATAAATATCTCACTCGCAA-TCAAAT-T-CAGCCGATAGCGG-AACGGGAAGGCGACTGGAGTGCCATGTCCGGTT-TTCAACAAAC-CAT-GCAAATGCTGAATGA--GGGCATCGTTCCCAC--T--GCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCCATTACCGAGTCCGG--G--C-T---GCGCGTT--GGT--GCG---GATATCTCGGTAG--TGG--GATACGACGATACCGAAGACAGCTC-ATG--TTATATCCCGCCGT--CAACCAC-CATCAA-ACAGGATTTTCG-CCTGCTGGGGCA-AACCAGC-GTGGACCG-CTTGCTGCAACTCT--\n",
      "  Score=843.8\n",
      "\n",
      "Sequence similarity: 88.60%\n",
      "Alignment score: 843.80\n",
      "\n",
      "Alignment 3 (NC_007596.2Mammuthusprimigeniusmitochondrion):\n",
      "TAGATACCTCACTATGCT-TAGCCA-TAAACC--TAGGCAG-AG-TA----TAACCAA---TC-TGCCAGCCAGAGTA-CTACTAGC-AAT-AGCTTAAAACTC-AAAGGACTTGGCGGTGCTTTATATCCACCTAGAGG-AGCCTGT-TCTA-TAATC-GATA-AACCCCGATAA-ACCTTAC---CACTTTTTGCTAATA-CAGTCT-ATATACCG-CCATCTTCAGCAAACCC-TTAAA-AGG----AATCACAGTA-AGC--AA--A-AACTTAGCACAT---AGGAACGTTAGGTCA--AGGTGTAA--CCTATAAA-GT---GGT--AAGAAATGGGCTACATTTT-T-TTAATTA-AA-A-ACACATT-CT--ATACTAAAC-CTA-TGAAAATA------TTAAGCCTAAGGT-GGATTTAGTAGTAAAT-TAAGAATAGAGAGCTTAATTG---AATG---A-GAAAATTGGGCGC--ACACAAT--GCCCGTCACC-CTCCTCAA--A--T--A-ATT--ATTACA--CAGTATAAAATACCA---TTAAAACAAAATCAACCAAAA-AGGAGAA-AAGTCGTAACAAGGT\n",
      "|||||||||||||||||  |||||  |||| |  |  | |  || ||    ||  |||   |  | || ||||||| | |||||||| |   |||||||||||  |||||||||||||||||||||||||||||||| || ||||||| ||   ||| | |||  ||||||||| | |||||||   |||   ||||||||  |||||  |||||||  ||||||||||||||||| |   | |||    ||  | |||  |||  ||  | |||    | |||   |  || |||||| |   ||||||    |||    | ||   |||  |||  |||||||||||||| | |||  || || | ||| |   |   ||||    | ||  || ||||       ||  |   ||||  |||||||||||||||  |||||||||||||||||||||   || |   | |||      ||||  |||||    |||||||| | ||||||||  |  |  | | |  |  |||  || |||    || ||   || |||| ||||  ||   || ||||| | |||||||||||||| \n",
      "TAGATACCTCACTATGC-CTAGCC-CTAAA-CTTT--G-A-TAGCTACCTTTA--CAAAGCT-AT-CC-GCCAGAG-AACTACTAGCCA--GAGCTTAAAACT-TAAAGGACTTGGCGGTGCTTTATATCCACCTAG-GGGAGCCTGTCTC--GTAA-CCGAT-GAACCCCGAT-ACACCTTACCGTCAC---TTGCTAAT-TCAGTC-CATATACC-ACCATCTTCAGCAAACCCCT---ATAGGGCACAA--A-AGT-GAGCTTAATCATAAC----C-CATGAAA--AA-GTTAGG-C-CGAGGTGT--CGCCT----ACGTGACGGTCAAAG--ATGGGCTACATTTTCTATTA--TAGAATAGACA-A--AC-GGATAC----CACT-CTG-AAAT-GGGTGGTT--G---AAGG-CGGATTTAGTAGTAAA-CTAAGAATAGAGAGCTTAATTGAACAA-GGCCATGAA------GCGCGTACACA--CCGCCCGTCA-CTCTCCTCAAGTACCTCCACA-TCAA--ACAATCA-TAT----TA-CAGATTT-AAAC-AAAT--AC---AAGAGGAG-ACAAGTCGTAACAAGG-\n",
      "  Score=728.2\n",
      "\n",
      "Sequence similarity: 79.80%\n",
      "Alignment score: 728.20\n",
      "\n",
      "Alignment 4 (NC_012920.1_homosapiens_mitochondrion):\n",
      "GT--ATC-TGAT---ATGG-AT-ACGGTGA-AT-A--G-TGGCTT---CTTCAGTGTGCAT----CGTATT--CATAG--AAG-----AAACGATTTTTTTCGAT------ACTGA-TTTCAT-TAGTGACT-ACGCTATCGGGCCTGCACTCTTTAAGGG--ATACAGATTCGTCG--ACTGTATTGGCCATTT-GGCCACAGTGTGAC-CACATG-A--CCATAATCTAC--ATAGGT------CTA--CATCAT-CGATAGGATT-GCATCGGGG--AG--ACGAT-ACGGGGACAGGTATGTATTCATGAGCGTTGAT----GTCG-CCACG-GATGCGTTAAGCATATCCGCTCCGG---ATC-CTCG-CTGGAT-----GGGGTATTACTGTTTATGAATGTCTTGTCTGC-TCAAAGGC-CGC-GC--TGCAG--CAAAA----TA-C----CCA-AATGTCC--------AAA----ATGTGGGA-----CATTT-GCACATTGGGTG--AC---AGTTGTT--C---GC-A-A--AAGGCGCGCCCGAAAGG------CATAATT-CG-GCCA---ACC-C---CA-A--ATT---CACAACTG-CAA---AA-----CAGT--AGTA-TAGT----GATCGA----CT-----ATA----T-----T----TGCA--T-TA-------\n",
      "    ||| | |    |  | |  ||    | |  |  | | || |   |  |      |||    || |    || |   ||      ||| ||       | |       ||  | ||| || |||   || || ||      ||| ||      ||  |  |||||    | | |  |    | ||    ||| |   || | | | | |||||  |  ||||||   ||  ||||||      |||  | |  | |  |   ||| || ||      ||  | ||| ||    ||    |||    ||  |||    ||    ||   ||| | ||   |||   ||   || |||      ||| | |  |  |||     ||   |  ||        ||          || ||||  || ||| ||  |||||  |||||    || |    ||| |    ||        |||    | ||  ||     | ||| ||| ||        ||   |||  ||  |   || | |  ||    | |||   |||      |  |||| || ||||   ||| |   || |  |||   | |||  | |||   ||     | ||  || | | ||    |||| |    ||     |||    |     |    || |  | ||       \n",
      "--CCATCCT-A-CCCA--GCA-CAC----ACA-CACCGCT-GC-TAACC--C------CATACCCCG-A--ACCA-A-CCAA-ACCCCAAA-GA-------C-A-CCCCCCAC--AGTTT-ATGTAG---CTTAC-CT------CCT-CA------AA--GCAATACA----C-T-GAAA----A-TG----TTTAG---AC-G-G-G-CTCACAT-CACCCCATAA---ACAAATAGGTTTGGTCCTAGCC-T--TTC--T---ATTAGC-TC----TTAGTAA-GATTAC----AC----ATG----CA--AGC----ATCCCCGT--TCCA-GTGA---GTT---CA---CC-CTC---TAAATCAC-C-AC--GATCAAAAGG---A--AC--------AA----------GCATCAA--GCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACA----CCCCCACGGGAAACAGCA-GT--GATTAACC-TTTAGCA-AT------AAACGAAAGT--TTAACTAAGCTATACTAA----C-CCC---AGGGTTGGTC--AATTTCGTGCCAGCCACCGCGGTCACACGATTAACC-CAA--GTCAATAGAAGCCGGC-GTAAAG-AGT-GTTTTAGATC-ACCCCCTCCCCAATAAAGCTAAAACTCACCTG-AGTTGTAAAAAACT\n",
      "  Score=497.2\n",
      "\n",
      "Sequence similarity: 60.40%\n",
      "Alignment score: 497.20\n"
     ]
    }
   ],
   "source": [
    "def analyze_alignments(generated_seqs: List[str],\n",
    "                       target_seqs: List[str],\n",
    "                       names: Optional[List[str]] = None\n",
    "                      ) -> List[dict]:\n",
    "    \"\"\"\n",
    "    Analyze and visualize alignments between generated and target sequences.\n",
    "    \n",
    "    Args:\n",
    "        generated_seqs: List of generated sequences\n",
    "        target_seqs: List of target sequences\n",
    "        names: Optional list of sequence names\n",
    "        \n",
    "    Returns:\n",
    "        List of alignment metrics for each sequence pair\n",
    "    \"\"\"\n",
    "    metrics = []\n",
    "    print(\"\\nSequence Alignments:\")\n",
    "    \n",
    "    for i, (gen_seq, target_seq) in enumerate(zip(generated_seqs, target_seqs)):\n",
    "        if names and i < len(names):\n",
    "            print(f\"\\nAlignment {i+1} ({names[i]}):\")\n",
    "        else:\n",
    "            print(f\"\\nAlignment {i+1}:\")\n",
    "        \n",
    "        gen_bio_seq = Seq(gen_seq)\n",
    "        target_bio_seq = Seq(target_seq)\n",
    "        \n",
    "        # Get alignments\n",
    "        alignments = pairwise2.align.globalms(\n",
    "            gen_bio_seq, target_bio_seq,\n",
    "            match=2,\n",
    "            mismatch=-1,\n",
    "            open=-0.5,\n",
    "            extend=-0.1\n",
    "        )\n",
    "        \n",
    "        best_alignment = alignments[0]\n",
    "        print(format_alignment(*best_alignment))\n",
    "        \n",
    "        matches = sum(a == b for a, b in zip(best_alignment[0], best_alignment[1]) \n",
    "                      if a != '-' and b != '-')\n",
    "        alignment_length = len(best_alignment[0].replace('-', ''))\n",
    "        similarity = (matches / len(target_seq)) * 100\n",
    "        \n",
    "        seq_metrics = {\n",
    "            'similarity': similarity,\n",
    "            'score': best_alignment[2],\n",
    "            'length': len(target_seq),\n",
    "            'gaps': best_alignment[0].count('-') + best_alignment[1].count('-')\n",
    "        }\n",
    "        \n",
    "        if names and i < len(names):\n",
    "            seq_metrics['name'] = names[i]\n",
    "            \n",
    "        metrics.append(seq_metrics)\n",
    "        \n",
    "        print(f\"Sequence similarity: {similarity:.2f}%\")\n",
    "        print(f\"Alignment score: {best_alignment[2]:.2f}\")\n",
    "    \n",
    "    return metrics\n",
    "\n",
    "# Analyze alignments\n",
    "alignment_metrics = analyze_alignments(generated_seqs, target_seqs, names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate with species prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Species tag prompt: |D__ANIMALIA;P__CHORDATA;C__MAMMALIA;O__DIPROTODONTIA;F__PHASCOLARCTIDAE;G__PHASCOLARCTOS;S__PHASCOLARCTOS CINEREUS|\n",
      "Initializing inference params with max_seqlen=616\n",
      "Prompt: \"|D__ANIMALIA;P__CHORDATA;C__MAMMALIA;O__DIPROTODONTIA;F__PHASCOLARCTIDAE;G__PHASCOLARCTOS;S__PHASCOLARCTOS CINEREUS|\",\tOutput: \"TAGTACCCCGTCCAATATTCGGAAAACGAGAACTGGACGAACTGAACTTACTTCTTTGTTGATGCACGGGAAGGATCTTCAGCTTATCACCGTCGCGTCGATCAAGTTACTGACTCACAATTCTTCTTTCTCTTCGAGGTCCTTTTCTAGATTTGTAAAGTTACGTTAGGTATTAATATCTACCGCATGTTCCGTCCAAAGTAAACGCTCCCCCTAACTGCATTATATTAAGCCGAACCGAACGAAGTTGCGCAGAAACTATGAACGTTTCCGTATTTGCGGAAGATATCTCTCAACTCTCTGCAAACTGAAATAAGCCAGTGAATATAACAATGAACGTTTCCGTATTTGCGGAAGATATCTCTCAACTCTCTGCAAACTGAAATAAGCCAGTGAATATAACAATAGAAAACCTTCGCACCTTACATTCGCGTCCATTAGGTATGCAGGCAGTTCGGCCGGGCCGAAGAATAAGAAGCCACCCCAACTCTGCAAAAAAA\",\tScore: -1.2092068195343018\n",
      "Koala sequence:\n",
      "TAGTACCCCGTCCAATATTCGGAAAACGAGAACTGGACGAACTGAACTTACTTCTTTGTTGATGCACGGGAAGGATCTTCAGCTTATCACCGTCGCGTCGATCAAGTTACTGACTCACAATTCTTCTTTCTCTTCGAGGTCCTTTTCTAGATTTGTAAAGTTACGTTAGGTATTAATATCTACCGCATGTTCCGTCCAAAGTAAACGCTCCCCCTAACTGCATTATATTAAGCCGAACCGAACGAAGTTGCGCAGAAACTATGAACGTTTCCGTATTTGCGGAAGATATCTCTCAACTCTCTGCAAACTGAAATAAGCCAGTGAATATAACAATGAACGTTTCCGTATTTGCGGAAGATATCTCTCAACTCTCTGCAAACTGAAATAAGCCAGTGAATATAACAATAGAAAACCTTCGCACCTTACATTCGCGTCCATTAGGTATGCAGGCAGTTCGGCCGGGCCGAAGAATAAGAAGCCACCCCAACTCTGCAAAAAAA\n"
     ]
    }
   ],
   "source": [
    "from evo2.utils import make_phylotag_from_gbif\n",
    "\n",
    "species = 'Phascolarctos cinereus' # Koala bear\n",
    "\n",
    "species_tag_prompt = make_phylotag_from_gbif(species)\n",
    "\n",
    "print(f\"Species tag prompt: {species_tag_prompt}\") # Check if the GBIF API returned a valid species tag!\n",
    "\n",
    "# Generate species tag\n",
    "koala_sequence = model.generate(\n",
    "    [species_tag_prompt],\n",
    "    n_tokens=500,\n",
    "    temperature=1.0,\n",
    ")\n",
    "\n",
    "print(f\"Koala sequence:\")\n",
    "print(koala_sequence.sequences[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf_tracking_test",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
